LSTM


In [1]:
from tensorflow import keras

In [4]:
import nltk
nltk.download('gutenberg')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\mcama\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.
Out[4]:
True

In [5]:
from nltk.corpus import gutenberg

gutenberg.fileids()


Out[5]:
['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [58]:
text = gutenberg.raw("melville-moby_dick.txt").lower()

In [59]:
print(f"Corpus length: {len(text.split())}")


Corpus length: 212030

In [68]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=None, 
                      filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n', 
                      lower=True, 
                      split=' ', 
                      char_level=False, 
                      oov_token=None, 
                      document_count=0)

tokenizer.fit_on_texts(text)

In [69]:
sequences = tokenizer.texts_to_sequences([text])

In [70]:
sequences[:5]


Out[70]:
[[14,
  14,
  14,
  14,
  3,
  3,
  14,
  14,
  6,
  3,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  3,
  14,
  3,
  3,
  3,
  14,
  3,
  6,
  6,
  14,
  14,
  14,
  14,
  3,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  3,
  3,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  3,
  14,
  14,
  3,
  14,
  14,
  3,
  14,
  3,
  14,
  3,
  14,
  14,
  3,
  3,
  3,
  14,
  3,
  3,
  14,
  14,
  3,
  14,
  14,
  3,
  14,
  3,
  3,
  3,
  6,
  3,
  3,
  3,
  3,
  14,
  6,
  19,
  3,
  3,
  14,
  6,
  6,
  14,
  14,
  6,
  3,
  14,
  3,
  14,
  14,
  14,
  3,
  14,
  3,
  3,
  14,
  14,
  14,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  3,
  14,
  3,
  3,
  14,
  14,
  3,
  14,
  3,
  14,
  14,
  14,
  3,
  3,
  14,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  14,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  14,
  14,
  6,
  3,
  14,
  14,
  14,
  3,
  14,
  3,
  3,
  14,
  3,
  14,
  3,
  4,
  3,
  3,
  14,
  14,
  3,
  3,
  14,
  3,
  3,
  3,
  14,
  14,
  19,
  14,
  3,
  14,
  14,
  3,
  3,
  14,
  3,
  14,
  14,
  3,
  14,
  3,
  14,
  3,
  3,
  3,
  14,
  6,
  19,
  3,
  14,
  3,
  14,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  6,
  6,
  3,
  6,
  3,
  6,
  6,
  3,
  6,
  6,
  6,
  14,
  3,
  14,
  3,
  3,
  14,
  3,
  14,
  3,
  3,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  6,
  6,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  3,
  6,
  6,
  3,
  6,
  3,
  3,
  3,
  6,
  6,
  6,
  3,
  3,
  6,
  3,
  6,
  14,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  3,
  14,
  3,
  6,
  3,
  6,
  3,
  14,
  6,
  3,
  3,
  3,
  6,
  3,
  3,
  14,
  6,
  3,
  19,
  3,
  6,
  3,
  6,
  14,
  14,
  14,
  14,
  6,
  3,
  6,
  6,
  6,
  3,
  6,
  3,
  14,
  3,
  3,
  6,
  6,
  6,
  3,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  6,
  3,
  6,
  6,
  14,
  3,
  3,
  3,
  14,
  3,
  3,
  6,
  3,
  6,
  3,
  3,
  6,
  6,
  3,
  6,
  6,
  3,
  14,
  6,
  3,
  6,
  3,
  6,
  6,
  14,
  3,
  3,
  6,
  3,
  3,
  3,
  6,
  6,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  14,
  6,
  3,
  3,
  3,
  3,
  14,
  3,
  3,
  6,
  14,
  3,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  14,
  3,
  3,
  14,
  3,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  3,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  3,
  6,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  3,
  19,
  3,
  14,
  3,
  3,
  3,
  3,
  14,
  6,
  3,
  3,
  3,
  6,
  6,
  3,
  3,
  3,
  6,
  3,
  14,
  6,
  6,
  3,
  6,
  3,
  3,
  6,
  14,
  3,
  14,
  6,
  3,
  3,
  3,
  6,
  14,
  3,
  3,
  3,
  14,
  3,
  14,
  6,
  14,
  3,
  3,
  14,
  14,
  14,
  6,
  6,
  6,
  6,
  14,
  6,
  3,
  14,
  3,
  6,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  14,
  6,
  3,
  6,
  3,
  3,
  6,
  3,
  3,
  3,
  3,
  3,
  14,
  6,
  3,
  14,
  3,
  3,
  6,
  3,
  3,
  3,
  6,
  3,
  3,
  14,
  6,
  6,
  6,
  6,
  14,
  6,
  14,
  6,
  3,
  3,
  3,
  3,
  6,
  6,
  6,
  3,
  14,
  6,
  3,
  3,
  3,
  6,
  3,
  6,
  3,
  3,
  6,
  14,
  3,
  6,
  6,
  14,
  3,
  6,
  3,
  14,
  14,
  3,
  14,
  3,
  6,
  14,
  3,
  3,
  14,
  6,
  14,
  14,
  14,
  6,
  14,
  3,
  3,
  14,
  3,
  14,
  14,
  6,
  14,
  6,
  14,
  6,
  6,
  3,
  3,
  3,
  6,
  6,
  3,
  6,
  3,
  14,
  3,
  3,
  3,
  3,
  6,
  3,
  19,
  6,
  3,
  3,
  14,
  6,
  3,
  3,
  3,
  14,
  3,
  14,
  3,
  6,
  3,
  3,
  3,
  3,
  3,
  14,
  6,
  3,
  3,
  3,
  3,
  14,
  3,
  3,
  6,
  14,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  6,
  6,
  3,
  3,
  3,
  6,
  3,
  6,
  3,
  3,
  3,
  6,
  3,
  6,
  3,
  6,
  3,
  14,
  6,
  6,
  6,
  3,
  6,
  6,
  14,
  6,
  3,
  3,
  6,
  3,
  3,
  3,
  3,
  14,
  6,
  3,
  3,
  6,
  6,
  3,
  3,
  6,
  3,
  3,
  6,
  6,
  3,
  3,
  6,
  3,
  3,
  6,
  3,
  6,
  3,
  3,
  3,
  6,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  6,
  6,
  3,
  14,
  6,
  6,
  6,
  6,
  6,
  6,
  14,
  6,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  6,
  3,
  3,
  6,
  3,
  3,
  6,
  3,
  14,
  6,
  3,
  3,
  3,
  19,
  3,
  3,
  3,
  3,
  14,
  6,
  6,
  14,
  6,
  3,
  6,
  6,
  3,
  14,
  6,
  6,
  14,
  1,
  14,
  14,
  6,
  6,
  6,
  14,
  3,
  14,
  3,
  14,
  6,
  3,
  6,
  14,
  14,
  3,
  6,
  3,
  6,
  6,
  3,
  6,
  6,
  3,
  3,
  14,
  6,
  3,
  6,
  14,
  6,
  3,
  14,
  14,
  6,
  14,
  14,
  14,
  14,
  14,
  14,
  6,
  6,
  3,
  6,
  6,
  6,
  6,
  6,
  14,
  6,
  3,
  6,
  3,
  3,
  3,
  6,
  6,
  6,
  6,
  3,
  3,
  6,
  3,
  14,
  6,
  6,
  3,
  3,
  6,
  6,
  3,
  6,
  6,
  3,
  6,
  6,
  3,
  3,
  6,
  6,
  3,
  6,
  6,
  6,
  6,
  6,
  6,
  14,
  6,
  6,
  6,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  3,
  6,
  3,
  6,
  3,
  6,
  3,
  6,
  6,
  3,
  14,
  3,
  6,
  6,
  3,
  3,
  3,
  3,
  14,
  3,
  6,
  6,
  3,
  6,
  3,
  3,
  3,
  6,
  3,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  6,
  6,
  3,
  14,
  3,
  3,
  3,
  14,
  6,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  3,
  14,
  3,
  14,
  3,
  3,
  6,
  3,
  14,
  6,
  3,
  3,
  3,
  3,
  14,
  14,
  6,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  6,
  3,
  14,
  3,
  14,
  3,
  3,
  3,
  3,
  3,
  14,
  3,
  6,
  3,
  3,
  14,
  3,
  3,
  3,
  3,
  14,
  3,
  14,
  3,
  3,
  3,
  14,
  3,
  14,
  14,
  14,
  14,
  14,
  14,
  14,
  3,
  3,
  6,
  6,
  ...]]

In [71]:
tokenizer.index_word


Out[71]:
{1: 'e',
 2: 't',
 3: 'a',
 4: 'o',
 5: 'n',
 6: 'i',
 7: 's',
 8: 'h',
 9: 'r',
 10: 'l',
 11: 'd',
 12: 'u',
 13: 'm',
 14: '\r',
 15: 'c',
 16: 'w',
 17: 'f',
 18: 'g',
 19: ',',
 20: 'p',
 21: 'b',
 22: 'y',
 23: 'v',
 24: 'k',
 25: '.',
 26: ';',
 27: '"',
 28: "'",
 29: '!',
 30: 'q',
 31: 'j',
 32: 'x',
 33: '?',
 34: 'z',
 35: ':',
 36: '1',
 37: '0',
 38: '2',
 39: '8',
 40: '5',
 41: '7',
 42: '3',
 43: '4',
 44: '6',
 45: '9'}

Create training and test dataset


In [72]:
import numpy as np

features = []
labels = []

training_length = 50

# Create multiple training examples from each sequence
for i in range(training_length, len(sequences)):

    # Extract the features and label
    extract = sequences[i - training_length:i + 1]

    # Set the features and label
    features.append(extract[:-1])
    labels.append(extract[-1])
        
features = np.array(features)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: